Analysis of DESeq2 results with gene set enrichment

0 Boilerplate

Libraries & external code needed:

Set working directory where the script is

if (Sys.getenv("USER")=="JQ") {
  setwd("/Users/JQ/Documents/_CODE REPOS/GitHub/Da_RNAseq")
} else if (Sys.getenv("RSTUDIO")==1) {
  setwd( dirname(rstudioapi::getSourceEditorContext(id = NULL)$path) ) # gets what is in the editor
} else {
  setwd(here::here())
  d <- str_split(getwd(),'/')[[1]][length(str_split(getwd(),'/')[[1]])]
  if (d != 'Da_RNAseq') { stop(
    paste0("Could not set working directory automatically to where this",
           " script resides.\nPlease do `setwd()` manually"))
    }
}
getwd()
## [1] "/Users/JQ/Documents/_CODE REPOS/GitHub/Da_RNAseq"

Path to definitive images (outside repo):

figdir <- paste0(c(head(str_split(getwd(),'/')[[1]],-1),
                   paste0(tail(str_split(getwd(),'/')[[1]],1), '_figures')),
                 collapse='/')
dir.create(figdir, showWarnings = FALSE)

1 Getting the data

This gets us the DGE data from DESeq2, identified by FlyBase/Ensembl ID and gene symbol:

# experimental design and labels
targets <- readRDS('output/targets.RDS')
# DEG data
DaKD_deg <- readRDS('output/Control_vs_DaKD.RDS')
DaOE_deg <- readRDS('output/Control_vs_DaOE.RDS')
DaDaOE_deg <- readRDS('output/Control_vs_DaDaOE.RDS')
ScOE_deg <- readRDS('output/Control_vs_ScOE.RDS')
head(
  ScOE_deg %>% dplyr::select(gene_symbol, ensemblGeneID, baseMean, log2FoldChange, padj),
  1
)

2 Over-representation of Gene Ontology terms

For daughterless knockdown:

# using `clusterProfiler` and `enrichplot`
# select reg set
daKD_up <- make_degset(DaKD_deg, up=TRUE, fc_thresh=1.5)
# ORA
daKD_up_eGO <- enrichGO(gene         = daKD_up$gene_symbol,
                        OrgDb         = org.Dm.eg.db,
                        keyType       = 'SYMBOL',
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.05,
                        qvalueCutoff  = 0.05)
# with down genes
daKD_dn <- make_degset(DaKD_deg, up=FALSE, fc_thresh=1.5)
daKD_dn_eGO <- enrichGO(gene         = daKD_dn$gene_symbol,
                        OrgDb         = org.Dm.eg.db,
                        keyType       = 'SYMBOL',
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.05,
                        qvalueCutoff  = 0.05)
# plot
up <- dotplot(daKD_up_eGO, showCategory=30, label_format=50) +
  ggtitle("Over-represented GO terms — genes **up** in *esg > da^RNAi^*") +
  scale_colour_gradient(low = cet_pal(3, name='cbd1')[2],
                        high = cet_pal(3, name='cbd1')[3]) +
  theme(plot.title = element_markdown(size=rel(1.2)),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=8, lineheight=0.5))

down <- dotplot(daKD_dn_eGO, showCategory=30, label_format=40) +
  ggtitle("Over-represented GO terms — genes **down** in *esg > da^RNAi^*") +
  scale_colour_gradient(low = cet_pal(3, name='cbd1')[2],
                        high = cet_pal(3, name='cbd1')[1]) +
  theme(plot.title = element_markdown(),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=9))

suppressWarnings(print(up))

suppressWarnings(print(down))

For daughterless overexpression:

# using `clusterProfiler` and `enrichplot`
# select reg set
daOE_up <- make_degset(DaOE_deg, up=TRUE, fc_thresh=1.5)
# ORA
daOE_up_eGO <- enrichGO(gene         = daOE_up$gene_symbol,
                        OrgDb         = org.Dm.eg.db,
                        keyType       = 'SYMBOL',
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.05,
                        qvalueCutoff  = 0.05)
# with down genes
daOE_dn <- make_degset(DaOE_deg, up=FALSE, fc_thresh=1.5)
daOE_dn_eGO <- enrichGO(gene         = daOE_dn$gene_symbol,
                        OrgDb         = org.Dm.eg.db,
                        keyType       = 'SYMBOL',
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.05,
                        qvalueCutoff  = 0.05)
# plot
up <- dotplot(daOE_up_eGO, showCategory=30, label_format=60) +
  ggtitle("Over-represented GO terms — genes **up** in *esg > da*") +
  scale_colour_gradient(low = cet_pal(3, name='cbd1')[2],
                        high = cet_pal(3, name='cbd1')[3]) +
  theme(plot.title = element_markdown(size=rel(1.15)),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=9))

down <- dotplot(daOE_dn_eGO, showCategory=30, label_format=60) +
  ggtitle("Over-represented GO terms — genes **down** in *esg > da*") +
  scale_colour_gradient(low = cet_pal(3, name='cbd1')[2],
                        high = cet_pal(3, name='cbd1')[1]) +
  theme(plot.title = element_markdown(size=rel(1.15)),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=8, lineheight=0.5))

suppressWarnings(print(up))

suppressWarnings(print(down))

For da:da overexpression:

# using `clusterProfiler` and `enrichplot`
# select reg set
dadaOE_up <- make_degset(DaDaOE_deg, up=TRUE, fc_thresh=1.5)
# ORA
dadaOE_up_eGO <- enrichGO(gene         = dadaOE_up$gene_symbol,
                        OrgDb         = org.Dm.eg.db,
                        keyType       = 'SYMBOL',
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.05,
                        qvalueCutoff  = 0.05)
# with down genes
dadaOE_dn <- make_degset(DaDaOE_deg, up = FALSE, fc_thresh = 1.5)
dadaOE_dn_eGO <- enrichGO(gene        = dadaOE_dn$gene_symbol,
                        OrgDb         = org.Dm.eg.db,
                        keyType       = 'SYMBOL',
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.05,
                        qvalueCutoff  = 0.05)
# plot
up <- dotplot(dadaOE_up_eGO, showCategory=30, label_format=40) +
  ggtitle("Over-represented GO terms — genes **up** in *esg > da:da*") +
  scale_colour_gradient(low = cet_pal(3, name='cbd1')[2],
                        high = cet_pal(3, name='cbd1')[3]) +
  force_panelsizes(rows = unit(3, "in"), cols = unit(2, "in")) +
  theme(plot.title = element_markdown(),
        plot.title.position = 'plot',
        panel.grid = element_blank(),
        axis.text.y = element_text(size=9))

down <- dotplot(dadaOE_dn_eGO, showCategory=30, label_format=40) +
  ggtitle("Over-represented GO terms — genes **down** in *esg > da:da*") +
  scale_colour_gradient(low = cet_pal(3, name='cbd1')[2],
                        high = cet_pal(3, name='cbd1')[1]) +
  force_panelsizes(rows = unit(3, "in"), cols = unit(2, "in")) +
  theme(plot.title = element_markdown(),
        plot.title.position = 'plot',
        panel.grid = element_blank(),
        axis.text.y = element_text(size=9))

tryCatch({ print(up) },
  error = function (e) {
    message('Error occurred when trying to plot dotplot')
    print(e)})
## <simpleError in ans[ypos] <- rep(yes, length.out = len)[ypos]: replacement has length zero>
tryCatch({ print(down) },
  error = function (e) {
    message('Error occurred when trying to plot dotplot')
    print(e)})

For scute overexpression:

# using `clusterProfiler` and `enrichplot`
# select reg set
scOE_up <- make_degset(ScOE_deg, up=TRUE, fc_thresh=1.5)
# ORA
scOE_up_eGO <- enrichGO(gene         = scOE_up$gene_symbol,
                        OrgDb         = org.Dm.eg.db,
                        keyType       = 'SYMBOL',
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.05,
                        qvalueCutoff  = 0.05)
# with down genes
scOE_dn <- make_degset(ScOE_deg, up=FALSE, fc_thresh=1.5)
scOE_dn_eGO <- enrichGO(gene         = scOE_dn$gene_symbol,
                        OrgDb         = org.Dm.eg.db,
                        keyType       = 'SYMBOL',
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.05,
                        qvalueCutoff  = 0.05)
# plot
up <- dotplot(scOE_up_eGO, showCategory=30, label_format=60) +
  ggtitle("Over-represented GO terms — genes **up** in *esg > scute*") +
  scale_colour_gradient(low = cet_pal(3, name='cbd1')[2],
                        high = cet_pal(3, name='cbd1')[3]) +
  theme(plot.title = element_markdown(),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=9))

down <- dotplot(scOE_dn_eGO, showCategory=30, label_format=60) +
  ggtitle("Over-represented GO terms — genes **down** in *esg > scute*") +
  scale_colour_gradient(low = cet_pal(3, name='cbd1')[2],
                        high = cet_pal(3, name='cbd1')[1]) +
  theme(plot.title = element_markdown(),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=8, lineheight=0.5))

suppressWarnings(print(up))

suppressWarnings(print(down))

None of these plots are particularly informative, so I am going to move on to GSEA and custom gene sets.

3 Gene Set Enrichment Analysis with Gene Ontology terms

For daughterless knockdown:

# using `clusterProfiler` and `enrichplot`
# select reg set
daKD_rank <- make_degrank(DaKD_deg, mode='log2fc')
# GSEA
daKD_GSEgo <- gseGO(geneList=daKD_rank,
                    keyType = 'SYMBOL',
                    OrgDb=org.Dm.eg.db,
                    minGSSize    = 15,
                    maxGSSize    = 1000,
                    pvalueCutoff = 0.05,
                    eps = 0,
                    verbose=F)
# get the logP values instead of padj 
daKD_GSEgo_log <- daKD_GSEgo
daKD_GSEgo_log@result$p.adjust <- -log10(daKD_GSEgo_log@result$p.adjust) # use @slot and $column to assign
# to subset the data to the highest NES values
#daKD_GSEgo_log@result <- subset(daKD_GSEgo_log@result, abs(NES)>1.8)
# to subset the data to the lowest p-values:
number_of_terms <- 30
cutoff <- rev(rev(sort(daKD_GSEgo_log$p.adjust))[1:number_of_terms])[1]
daKD_GSEgo_log@result <- subset(daKD_GSEgo_log@result, p.adjust>=cutoff)
# to control colour of GO terms according to enrichment or depletion
d <- ifelse(sort(daKD_GSEgo_log$NES) > 0, cet_pal(2, name='cbd1')[2], cet_pal(2, name='cbd1')[1])
# plot
gsep <- dotplot(daKD_GSEgo_log,
                x='NES', 
                showCategory=number_of_terms,
                label_format=60,
                ) +
  ggtitle("Enriched/depleted GO terms in *esg > da^RNAi^*") +
  scale_colour_gradient(low = cet_pal(3, name='d2')[2],
                        high = cet_pal(3, name='d2')[3]) +
  labs(color = '-log~10~(*p*)') +
  theme(plot.title = element_markdown(),
        plot.title.position = 'panel',
        legend.title = element_markdown(),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=9, colour=d))

suppressWarnings(print(gsep))

For daughterless overexpression:

# using `clusterProfiler` and `enrichplot`
# select reg set
daOE_rank <- make_degrank(DaOE_deg, mode='log2fc')
# GSEA
daOE_GSEgo <- gseGO(geneList=daOE_rank,
                    keyType = 'SYMBOL',
                    OrgDb=org.Dm.eg.db,
                    minGSSize    = 15,
                    maxGSSize    = 1000,
                    pvalueCutoff = 0.05,
                    eps = 0,
                    verbose=F)
# get the logP values instead of padj 
daOE_GSEgo_log <- daOE_GSEgo
daOE_GSEgo_log@result$p.adjust <- -log10(daOE_GSEgo_log@result$p.adjust) # use @slot and $column to assign
# to subset the data to the highest NES values
#daOE_GSEgo_log@result <- subset(daOE_GSEgo_log@result, abs(NES)>1.8)
# to subset the data to the lowest p-values:
number_of_terms <- 30
cutoff <- rev(rev(sort(daOE_GSEgo_log$p.adjust))[1:number_of_terms])[1]
daOE_GSEgo_log@result <- subset(daOE_GSEgo_log@result, p.adjust>=cutoff)
# to control colour of GO terms according to enrichment or depletion
d <- ifelse(sort(daOE_GSEgo_log$NES) > 0, cet_pal(2, name='cbd1')[2], cet_pal(2, name='cbd1')[1])
# plot
gsep <- dotplot(daOE_GSEgo_log,
                x='NES', 
                showCategory=number_of_terms,
                label_format=60,
                ) +
  ggtitle("Enriched/depleted GO terms in *esg > da*") +
  scale_colour_gradient(low = cet_pal(3, name='d2')[2],
                        high = cet_pal(3, name='d2')[3]) +
  labs(color = '-log~10~(*p*)') +
  theme(plot.title = element_markdown(),
        plot.title.position = 'panel',
        legend.title = element_markdown(),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=9, colour=d))

suppressWarnings(print(gsep))

For da:da overexpression:

# using `clusterProfiler` and `enrichplot`
# select reg set
dadaOE_rank <- make_degrank(DaDaOE_deg, mode='log2fc')
# GSEA
dadaOE_GSEgo <- gseGO(geneList=dadaOE_rank,
                    keyType = 'SYMBOL',
                    OrgDb=org.Dm.eg.db,
                    minGSSize    = 15,
                    maxGSSize    = 1000,
                    pvalueCutoff = 0.05,
                    eps = 0,
                    verbose=F)
# get the logP values instead of padj 
dadaOE_GSEgo_log <- dadaOE_GSEgo
dadaOE_GSEgo_log@result$p.adjust <- -log10(dadaOE_GSEgo_log@result$p.adjust) # use @slot and $column to assign
# to subset the data to the highest NES values
#dadaOE_GSEgo_log@result <- subset(dadaOE_GSEgo_log@result, abs(NES)>1.8)
# to subset the data to the lowest p-values:
number_of_terms <- 30
cutoff <- rev(rev(sort(dadaOE_GSEgo_log$p.adjust))[1:number_of_terms])[1]
dadaOE_GSEgo_log@result <- subset(dadaOE_GSEgo_log@result, p.adjust>=cutoff)
# to control colour of GO terms according to enrichment or depletion
d <- ifelse(sort(dadaOE_GSEgo_log$NES) > 0,
            cet_pal(2, name='cbd1')[2], cet_pal(2, name='cbd1')[1])
# plot
gsep <- dotplot(dadaOE_GSEgo_log,
                x='NES', 
                showCategory=number_of_terms,
                label_format=50,
                ) +
  ggtitle("Enriched/depleted GO terms in *esg > da:da*") +
  scale_colour_gradient(low = cet_pal(3, name='d2')[2],
                        high = cet_pal(3, name='d2')[3]) +
  labs(color = '-log~10~(*p*)') +
  #coord_fixed(ratio=0.8) +
  force_panelsizes(rows = unit(3.5,'in'), cols = unit(4,'in')) +
  theme(plot.title = element_markdown(),
        plot.title.position = 'panel',
        legend.title = element_markdown(),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=7, lineheight=0.7, colour=d)
        )
#gsep$coordinates$ratio <- 0.01
suppressWarnings(print(gsep))

For scute overexpression:

# using `clusterProfiler` and `enrichplot`
# select reg set
scOE_rank <- make_degrank(ScOE_deg, mode='log2fc')
# GSEA
scOE_GSEgo <- gseGO(geneList=scOE_rank,
                    keyType = 'SYMBOL',
                    OrgDb=org.Dm.eg.db,
                    minGSSize    = 15,
                    maxGSSize    = 1000,
                    pvalueCutoff = 0.05,
                    nPermSimple = 1000000,
                    eps = 0,
                    verbose=F)
# get the logP values instead of padj 
scOE_GSEgo_log <- scOE_GSEgo
scOE_GSEgo_log@result$p.adjust <- -log10(scOE_GSEgo_log@result$p.adjust) # use @slot and $column to assign
# to subset the data to the highest NES values
#scOE_GSEgo_log@result <- subset(scOE_GSEgo_log@result, abs(NES)>1.8)
# to subset the data to the lowest p-values:
number_of_terms <- 30
cutoff <- rev(rev(sort(scOE_GSEgo_log$p.adjust))[1:number_of_terms])[1]
scOE_GSEgo_log@result <- subset(scOE_GSEgo_log@result, p.adjust>=cutoff)
# to control colour of GO terms according to enrichment or depletion
d <- ifelse(sort(scOE_GSEgo_log$NES) > 0,
            cet_pal(2, name='cbd1')[2], cet_pal(2, name='cbd1')[1])
# plot
gsep <- dotplot(scOE_GSEgo_log,
                x='NES', 
                showCategory=number_of_terms,
                label_format=60
                ) +
  ggtitle("Enriched/depleted GO terms in *esg > scute*") +
  scale_colour_gradient(low = cet_pal(3, name='d2')[2],
                        high = cet_pal(3, name='d2')[3]) +
  labs(color = '-log~10~(*p*)') +
  theme(plot.title = element_markdown(),
        plot.title.position = 'panel',
        legend.title = element_markdown(),
        panel.grid = element_blank(),
        axis.text.y = element_text(size=9, colour=d))

suppressWarnings(print(gsep))

I think that the signals are not strong enough to give anything meaningful. It may be better to use more ‘focused’ gene sets.